#problem 1
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library (readr)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggplot2)
library(mizer)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Location of the Washington Post homicide CSV on GitHub.
urlfile <- "https://raw.githubusercontent.com/washingtonpost/data-homicides/master/homicide-data.csv"

# Read the data, treating empty strings and the literal "Unknown" as NA so
# that columns containing "Unknown" can still be parsed as numeric.
# Then:
#   * build `city_state` by concatenating city and state with no separator
#     (e.g. "BaltimoreMD"),
#   * classify each case as "solved" / "unsolved" from `disposition`
#     (any other disposition value yields NA),
#   * drop the "TulsaAL" rows -- presumably a data-entry error; TODO confirm,
#   * move `city_state` to the first column.
homicide_df <-
  read_csv(url(urlfile), na = c("", "Unknown")) %>%
  mutate(city_state = str_c(city, state)) %>%
  mutate(
    resolution = case_when(
      disposition %in% c(
        "Closed without arrest",
        "Open/No arrest"
      ) ~ "unsolved",
      disposition == "Closed by arrest" ~ "solved"
    )
  ) %>%
  filter(city_state != "TulsaAL") %>%
  relocate(city_state)
## Rows: 52179 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): uid, victim_last, victim_first, victim_race, victim_sex, city, stat...
## dbl (4): reported_date, victim_age, lat, lon
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Let’s focus on Baltimore, MD.
# Keep only the Baltimore, MD homicides.
baltimore_df <- filter(homicide_df, city_state == "BaltimoreMD")

# One-row summary: number of unsolved homicides and total homicides.
baltimore_summary <- summarize(
  baltimore_df,
  unsolved = sum(resolution == "unsolved"),
  n = n()
)

# One-sample proportion test on the share of unsolved homicides
# (prop.test() defaults: null proportion 0.5, two-sided).
baltimore_test <- prop.test(
  x = baltimore_summary %>% pull(unsolved),
  n = baltimore_summary %>% pull(n)
)

# Show the test as a one-row tibble (estimate, CI bounds, p-value, ...).
broom::tidy(baltimore_test)
## # A tibble: 1 × 8
## estimate statistic p.value parameter conf.low conf.high method alternative
## <dbl> <dbl> <dbl> <int> <dbl> <dbl> <chr> <chr>
## 1 0.646 239. 6.46e-54 1 0.628 0.663 1-sample… two.sided
#Let’s try to iterate across cities!
First off, write a function and test it on a few sample cities.
# Run a one-sample proportion test on the share of unsolved homicides.
#
# Args:
#   city_df: data frame with a `resolution` column; rows whose value equals
#            "unsolved" are counted as unsolved, and every row counts toward
#            the total.
# Returns:
#   The object returned by prop.test() for (unsolved count, total count).
prop_test_function <- function(city_df) {
  # One-row summary holding the two counts the test needs.
  city_summary <- summarize(
    city_df,
    unsolved = sum(resolution == "unsolved"),
    n = n()
  )

  # The argument expressions are deparsed by prop.test() into the "data:"
  # line of the printed result, so they are written out in piped form.
  prop.test(
    x = city_summary %>% pull(unsolved),
    n = city_summary %>% pull(n)
  )
}
# Spot check: run the function on the Baltimore subset and print the full
# prop.test() result.
prop_test_function(baltimore_df)
##
## 1-sample proportions test with continuity correction
##
## data: city_summary %>% pull(unsolved) out of city_summary %>% pull(n), null probability 0.5
## X-squared = 239.01, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.6275625 0.6631599
## sample estimates:
## p
## 0.6455607
# Second spot check on a different city: Albuquerque, NM.
prop_test_function(
  filter(homicide_df, city_state == "AlbuquerqueNM")
)
##
## 1-sample proportions test with continuity correction
##
## data: city_summary %>% pull(unsolved) out of city_summary %>% pull(n), null probability 0.5
## X-squared = 19.114, df = 1, p-value = 1.232e-05
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.3372604 0.4375766
## sample estimates:
## p
## 0.3862434
Now, let’s iterate across all cities.
# Per-city estimate of the unsolved proportion with its confidence interval.
# Every column from `uid` through `resolution` is nested into a per-city
# data frame, the proportion test is run on each nested frame and tidied,
# and only the estimate and the conf.* columns are kept.
results_df <-
  homicide_df %>%
  nest(data = uid:resolution) %>%
  mutate(
    tidy_results = map(data, ~ broom::tidy(prop_test_function(.x)))
  ) %>%
  select(city_state, tidy_results) %>%
  unnest(tidy_results) %>%
  select(city_state, estimate, starts_with("conf"))
# Point estimate and confidence interval of the unsolved proportion for each
# city, with cities ordered along the x axis by their estimate. Axis labels
# are rotated 90 degrees so the city names stay legible.
ggplot(
  data = mutate(results_df, city_state = fct_reorder(city_state, estimate)),
  mapping = aes(x = city_state, y = estimate)
) +
  geom_point() +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Alternative route to the same table without nesting: summarise the two
# counts per city first, then feed each (unsolved, n) pair to prop.test()
# and tidy the result in a single step.
homicide_df %>%
  group_by(city_state) %>%
  summarise(unsolved = sum(resolution == "unsolved"), n = n()) %>%
  mutate(
    tidy_results = map2(
      unsolved, n,
      ~ broom::tidy(prop.test(x = .x, n = .y))
    )
  ) %>%
  unnest(tidy_results) %>%
  select(city_state, estimate, starts_with("conf"))
## # A tibble: 50 × 4
## city_state estimate conf.low conf.high
## <chr> <dbl> <dbl> <dbl>
## 1 AlbuquerqueNM 0.386 0.337 0.438
## 2 AtlantaGA 0.383 0.353 0.415
## 3 BaltimoreMD 0.646 0.628 0.663
## 4 Baton RougeLA 0.462 0.414 0.511
## 5 BirminghamAL 0.434 0.399 0.469
## 6 BostonMA 0.505 0.465 0.545
## 7 BuffaloNY 0.612 0.569 0.654
## 8 CharlotteNC 0.300 0.266 0.336
## 9 ChicagoIL 0.736 0.724 0.747
## 10 CincinnatiOH 0.445 0.408 0.483
## # … with 40 more rows
##problem 2
# List the data files of each study arm as a one-column tibble (`file`,
# holding base file names).
# list.files() interprets `pattern` as a regular expression, not a shell
# glob, so a plain substring is the correct way to select names containing
# "con" / "exp". The glob-style "*con*" is read as "co" followed by zero or
# more "n" and only matched the intended files by accident.
file_con <- tibble(
  file = list.files(path = "/Users/lin/Desktop/data/", pattern = "con")
)
file_exp <- tibble(
  file = list.files(path = "/Users/lin/Desktop/data/", pattern = "exp")
)
# Read every control-arm file and stack the results into one tibble.
# The file names are pulled out of the tibble and read one at a time, so
# each file is an explicit element of the iteration. Mapping over the tibble
# itself iterates over its columns and hands read_csv() the whole filename
# vector at once. bind_rows() stacks the pieces and also copes with an empty
# file listing.
# `type` marks the arm; `subject_id` is the position of the file in the
# listing (1, 2, ...), so rows are already ordered by subject_id.
datacon <-
  file_con %>%
  pull(file) %>%
  map(
    ~ read_csv(
      file.path("/Users/lin/Desktop/data/", .x),
      show_col_types = FALSE
    )
  ) %>%
  bind_rows() %>%
  mutate(
    type = "con",
    subject_id = row_number()
  ) %>%
  relocate(subject_id, type)
## Rows: 10 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read every experimental-arm file and stack the results into one tibble.
# The file names are pulled out of the tibble and read one at a time, so
# each file is an explicit element of the iteration. Mapping over the tibble
# itself iterates over its columns and hands read_csv() the whole filename
# vector at once. bind_rows() stacks the pieces and also copes with an empty
# file listing.
# `type` marks the arm; `subject_id` is the position of the file in the
# listing (1, 2, ...), so rows are already ordered by subject_id.
dataexp <-
  file_exp %>%
  pull(file) %>%
  map(
    ~ read_csv(
      file.path("/Users/lin/Desktop/data/", .x),
      show_col_types = FALSE
    )
  ) %>%
  bind_rows() %>%
  mutate(
    type = "exp",
    subject_id = row_number()
  ) %>%
  relocate(subject_id, type)
## Rows: 10 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Stack the experimental and control arms into one table, ordered by
# subject_id (within a subject_id the "exp" row comes first because dataexp
# is listed first and arrange() keeps ties in their existing order).
# bind_rows() is used rather than union(): union() is a set operation that
# silently drops duplicated rows, which is not wanted when simply
# concatenating observations.
arm <-
  bind_rows(dataexp, datacon) %>%
  arrange(subject_id)
arm
## # A tibble: 20 × 10
## subject_id type week_1 week_2 week_3 week_4 week_5 week_6 week_7 week_8
## <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 exp 3.05 3.67 4.84 5.8 6.33 5.46 6.38 5.91
## 2 1 con 0.2 -1.31 0.66 1.96 0.23 1.09 0.05 1.94
## 3 2 exp -0.84 2.63 1.64 2.58 1.24 2.32 3.11 3.78
## 4 2 con 1.13 -0.88 1.07 0.17 -0.83 -0.31 1.58 0.44
## 5 3 exp 2.15 2.08 1.82 2.84 3.36 3.61 3.37 3.74
## 6 3 con 1.77 3.11 2.22 3.26 3.31 0.89 1.88 1.01
## 7 4 exp -0.62 2.54 3.78 2.73 4.49 5.82 6 6.49
## 8 4 con 1.04 3.66 1.22 2.33 1.47 2.7 1.87 1.66
## 9 5 exp 0.7 3.33 5.34 5.57 6.9 6.66 6.24 6.95
## 10 5 con 0.47 -0.58 -0.09 -1.37 -0.32 -2.17 0.45 0.48
## 11 6 exp 3.73 4.08 5.4 6.41 4.87 6.09 7.66 5.83
## 12 6 con 2.37 2.5 1.59 -0.16 2.08 3.07 0.78 2.35
## 13 7 exp 1.18 2.35 1.23 1.17 2.02 1.61 3.13 4.88
## 14 7 con 0.03 1.21 1.13 0.64 0.49 -0.12 -0.07 0.46
## 15 8 exp 1.37 1.43 1.84 3.6 3.8 4.72 4.68 5.7
## 16 8 con -0.08 1.42 0.09 0.36 1.18 -1.16 0.33 -0.44
## 17 9 exp -0.4 1.08 2.66 2.7 2.8 2.64 3.51 3.27
## 18 9 con 0.08 1.24 1.44 0.41 0.95 2.75 0.3 0.03
## 19 10 exp 1.09 2.8 2.8 4.3 2.25 6.57 6.09 4.64
## 20 10 con 2.14 1.15 2.52 3.44 4.26 0.97 2.73 -0.53
# Reshape to long format: one row per (type, subject_id, week) with the
# week name in `variable` and the measurement in `value`.
# `id.vars` is spelled out in full instead of relying on partial argument
# matching.
df2 <- melt(arm, id.vars = c("type", "subject_id"))

# Convert subject_id to a factor directly from its numeric values so the
# levels sort as 1, 2, ..., 10 rather than as strings ("1", "10", "2", ...).
df2$subject_id <- factor(df2$subject_id)

# Spaghetti plot: one line per subject, coloured by subject and with the
# line type distinguishing the control and experimental arms.
# The "Paired" ColorBrewer palette supports up to 12 colours; the default
# "Set2" palette only has 8 and triggers a warning with more subjects.
fig <- plot_ly(df2, colors = "Paired") %>%
  add_lines(
    x = ~variable, y = ~value,
    color = ~subject_id, linetype = ~type
  ) %>%
  layout(
    title = "Arm experiment",
    xaxis = list(title = "Week"),
    yaxis = list(title = "arm data"),
    legend = list(
      title = list(text = "control and experimental <br> of observations")
    )
  )
fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
##problem 3
library(tidyverse)
# Fix the RNG seed so the same cells are blanked out on every run.
set.seed(10)

# Copy of `iris` in which, column by column, 20 randomly chosen positions
# out of 1:150 are replaced by NA; Species is then converted to character.
iris_with_missing <-
  iris %>%
  map_df(function(column) replace(column, sample(1:150, 20), NA)) %>%
  mutate(Species = as.character(Species))
Write a function that fills in the missing values of a vector.
# Fill in the missing values of a single vector.
#
# Args:
#   vector: a numeric or character vector, possibly containing NA.
# Returns:
#   A vector of the same length in which
#     * numeric NAs are replaced by the mean of the observed (non-missing)
#       values, rounded to one decimal place;
#     * character NAs are replaced by "virginica";
#     * any other type is returned unchanged.
#   A numeric vector with no observed values at all is returned unchanged,
#   because there is no mean to impute.
fill_in_missing <- function(vector) {
  if (is.numeric(vector)) {
    # Mean of the observed values. `mean(!is.na(vector))` would instead give
    # the proportion of non-missing entries.
    fill_value <- round(mean(vector, na.rm = TRUE), 1)
    # mean() of zero observed values is NaN; leave the NAs alone in that case.
    if (!is.nan(fill_value)) {
      vector[is.na(vector)] <- fill_value
    }
  } else if (is.character(vector)) {
    vector[is.na(vector)] <- "virginica"
  }
  vector
}
# Apply fill_in_missing() once per column, passing each whole column as the
# vector argument, so that the function can see every observed value in the
# column when it chooses a fill value. Calling it element by element hands
# it one value at a time, so no column-level statistic can ever be computed.
# across() also avoids the list-columns and the deprecated multi-column
# unnest() call that the per-element approach required.
iris_with_missing <-
  iris_with_missing %>%
  mutate(across(everything(), fill_in_missing))
iris_with_missing
## # A tibble: 150 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 0 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 0 0.1 setosa
## # … with 140 more rows